{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "\n",
    "def _pad_sequence(\n",
    "    sequence,\n",
    "    n,\n",
    "    pad_left = False,\n",
    "    pad_right = False,\n",
    "    left_pad_symbol = None,\n",
    "    right_pad_symbol = None,\n",
    "):\n",
    "    sequence = iter(sequence)\n",
    "    if pad_left:\n",
    "        sequence = itertools.chain((left_pad_symbol,) * (n - 1), sequence)\n",
    "    if pad_right:\n",
    "        sequence = itertools.chain(sequence, (right_pad_symbol,) * (n - 1))\n",
    "    return sequence\n",
    "\n",
    "def ngrams(\n",
    "    sequence,\n",
    "    n: int,\n",
    "    pad_left = False,\n",
    "    pad_right = False,\n",
    "    left_pad_symbol = None,\n",
    "    right_pad_symbol = None,\n",
    "):\n",
    "    sequence = _pad_sequence(\n",
    "        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol\n",
    "    )\n",
    "\n",
    "    history = []\n",
    "    while n > 1:\n",
    "        try:\n",
    "            next_item = next(sequence)\n",
    "        except StopIteration:\n",
    "            return\n",
    "        history.append(next_item)\n",
    "        n -= 1\n",
    "    for item in sequence:\n",
    "        history.append(item)\n",
    "        yield tuple(history)\n",
    "        del history[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('saya', 'suka'), ('suka', 'makan'), ('makan', 'ayam')]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(ngrams(['saya', 'suka', 'makan', 'ayam'], 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['kelantan-words.json',\n",
       " 'negeri-sembilan-words.json',\n",
       " 'melaka-words.json',\n",
       " 'johor-words.json',\n",
       " 'pahang-words.json',\n",
       " 'kedah-words.json',\n",
       " 'perak-words.json',\n",
       " 'terengganu-words.json',\n",
       " 'english-words.json',\n",
       " 'sabah-words.json',\n",
       " 'sarawak-words.json']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "languages = glob('*-words.json')\n",
    "languages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "wok lor\n",
      "mahkamah tinggi ayoh kob\n",
      "lorong tua sebutan bunyi dihidung\n",
      "jjughuh baik jjughuh budok tu baik budak itu\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "8"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open(languages[0]) as fopen:\n",
    "    lang = set(json.load(fopen))\n",
    "    \n",
    "longest = 0\n",
    "for l in lang:\n",
    "    ls = len(l.split())\n",
    "    if ls > longest:\n",
    "        print(l)\n",
    "        longest = ls\n",
    "        \n",
    "longest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7230902"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('social-language.json') as fopen:\n",
    "    malay = json.load(fopen)['malay']\n",
    "    \n",
    "len(malay)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words = []\n",
    "\n",
    "languages = glob('*-words.json')\n",
    "for language in languages:\n",
    "    if 'english' in language:\n",
    "        continue\n",
    "    label = language.replace('-words.json', '')\n",
    "    \n",
    "    with open(language) as fopen:\n",
    "        lang = json.load(fopen)\n",
    "    words.append(lang)\n",
    "    \n",
    "len(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "set()"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set.intersection(*map(set,words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('malays_word.json') as fopen:\n",
    "    malays = set(json.load(fopen))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "kelantan-words.json\n",
      "503\n",
      "420\n",
      "wok lor\n",
      "mahkamah tinggi ayoh kob\n",
      "lorong tua sebutan bunyi dihidung\n",
      "jjughuh baik jjughuh budok tu baik budak itu\n",
      "negeri-sembilan-words.json\n",
      "875\n",
      "801\n",
      "melaka-words.json\n",
      "375\n",
      "273\n",
      "johor-words.json\n",
      "215\n",
      "171\n",
      "pahang-words.json\n",
      "294\n",
      "201\n",
      "kedah-words.json\n",
      "2049\n",
      "2001\n",
      "perak-words.json\n",
      "150\n",
      "121\n",
      "terengganu-words.json\n",
      "351\n",
      "283\n",
      "sabah-words.json\n",
      "124\n",
      "64\n",
      "sarawak-words.json\n",
      "209\n",
      "165\n"
     ]
    }
   ],
   "source": [
    "languages_dict = {}\n",
    "longest = 0\n",
    "languages = glob('*-words.json')\n",
    "for language in languages:\n",
    "    if 'english' in language:\n",
    "        continue\n",
    "    print(language)\n",
    "    label = language.replace('-words.json', '')\n",
    "    \n",
    "    with open(language) as fopen:\n",
    "        lang = set(json.load(fopen))\n",
    "    \n",
    "    print(len(lang))\n",
    "    lang = lang - malays\n",
    "    print(len(lang))\n",
    "\n",
    "    \n",
    "    for l in lang:\n",
    "        ls = len(l.split())\n",
    "        if ls > longest:\n",
    "            print(l)\n",
    "            longest = ls\n",
    "            \n",
    "    languages_dict[label] = lang"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 7230902/7230902 [09:52<00:00, 12193.86it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "results = {}\n",
    "\n",
    "for s in tqdm(malay):\n",
    "    splitted = s.split()\n",
    "    ngs = splitted[:]\n",
    "    for i in range(2, longest):\n",
    "        ngs.extend([' '.join(n) for n in ngrams(splitted, i)])\n",
    "    \n",
    "    found = False\n",
    "    for k, v in languages_dict.items():\n",
    "        r = set(ngs) & v\n",
    "        if len(r):\n",
    "            # print(s, k, r)\n",
    "            found = True\n",
    "            if k in results:\n",
    "                results[k].append(s)\n",
    "            else:\n",
    "                results[k] = [s]\n",
    "            break\n",
    "    \n",
    "    if not found:\n",
    "        if 'malay' in results:\n",
    "            results['malay'].append(s)\n",
    "        else:\n",
    "            results['malay'] = [s]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "malay 7153125\n",
      "kedah 36271\n",
      "johor 2575\n",
      "melaka 10393\n",
      "terengganu 4839\n",
      "sarawak 7401\n",
      "negeri-sembilan 7717\n",
      "kelantan 2438\n",
      "perak 1296\n",
      "pahang 3596\n",
      "sabah 1251\n"
     ]
    }
   ],
   "source": [
    "for k, v in results.items():\n",
    "    print(k, len(v))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from random import sample\n",
    "\n",
    "results['malay'] = sample(results['malay'],100000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('sublanguages.json', 'w') as fopen:\n",
    "    json.dump(results, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('sublanguages-malay.json', 'w') as fopen:\n",
    "#     json.dump(malays, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "\n",
    "bucketName = 'malaya-dataset'\n",
    "Key = 'sublanguages.json'\n",
    "outPutname = \"language-detection/sublanguages.json\"\n",
    "\n",
    "s3 = boto3.client('s3')\n",
    "s3.upload_file(Key,bucketName,outPutname)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
